Basic R

Cast dates

start_date <- "2017-01-01"
end_date <- "2019-12-31"
f1<-function(d2, d1){
  n_weeks <-  floor(as.numeric(difftime(d2, d1, units="weeks")))
}
f2<-function(d2, d1){
  n_weeks <- floor(as.numeric(difftime(as.Date(d2)
    , as.Date(d1), units = "weeks")))
}
m1<-microbenchmark(
  Nocast = f1(end_date, start_date),
  Cast = f2(end_date, start_date),
  times = 1000
)

print(m1)
## Unit: microseconds
##    expr     min       lq     mean  median      uq      max neval
##  Nocast 383.075 390.7645 419.8791 397.262 419.187 3225.200  1000
##    Cast 127.117 131.2550 141.7413 133.129 142.541 2525.855  1000
fig <- fbox_plot(m1, "microseconds")
fig

Explicit vector length vector(“type”, length) is faster than an empty vector c()

create_c <- function (n){
  x <- c()
  for (i in seq(n)) {
    x <- c(x, i)
  }
}
create_vector <- function (n){
  x <- vector("integer", n)
  for (i in seq(n)) {
    x[i] <- i
  }
}
m3 <- microbenchmark(
  with_c = create_c(1e4),
  with_vector = create_vector(1e4),
  times = 10
)
print(m3)
## Unit: microseconds
##         expr      min        lq       mean   median        uq       max neval
##       with_c 65955.37 66058.682 69468.1536 66148.52 75155.912 81450.160    10
##  with_vector   342.60   345.975   622.1023   353.48   371.143  3024.906    10
fig <- fbox_plot(m3, "microseconds")
fig

which function is slow for some simple situations

vector <- runif(1e8)
w1 <- function(x){
  d <- length(which(x > .5))
}
w2 <- function(x){
  d <- sum(x > .5)
}

m4 <- microbenchmark(
  which = w1(vector),
  nowhich = w2(vector),
  times = 10
)
print(m4)
## Unit: milliseconds
##     expr      min       lq     mean   median       uq      max neval
##    which 624.4498 626.7169 644.8899 630.9330 632.9892 708.7845    10
##  nowhich 217.9415 219.1952 236.0176 223.0684 224.5313 299.7681    10
fig <- fbox_plot(m4, "miliseconds")
fig

Column operation is faster than row operation

n <- 1e4
dt <- data.table(
  a = seq(n), b = runif(n)
)
v1 <- function(dt){
  d <- mean(dt[dt$b > .5, ]$a)
}
v2 <- function(dt){
  d <- mean(dt$a[dt$b > .5])
}
m5 <- microbenchmark(
  row_operation = v1(dt),
  column_operation = v2(dt),
  times = 10
)
print(m5)
## Unit: microseconds
##              expr     min      lq     mean   median      uq      max neval
##     row_operation 163.615 168.364 893.0288 176.7500 195.946 5314.840    10
##  column_operation  58.429  65.021 268.5103  69.2095  77.315 2056.819    10
fig <- fbox_plot(m5, "microseconds")
fig

Sequences function safer than 1:n

The function seq prevents when the second part of the 1:x is zero

num <- 1e7
s1 <- function(num){
  d <- mean(1:num)
}
s2 <- function(num){
  d <- mean(seq(num))
}
m6<-microbenchmark(
  noseq = s1(num),
  seq = s2(num),
  times = 30
)
print(m6)
## Unit: milliseconds
##   expr      min       lq     mean   median       uq      max neval
##  noseq 69.83089 69.88638 69.96562 69.91265 69.95498 71.41214    30
##    seq 69.84399 69.91782 69.99335 69.95550 69.98267 71.35749    30
fig <- fbox_plot(m6, "miliseconds")
fig

paste0 is faster than glue

large_dataset <- data.table(
  id = 1:1000000,
  value = sample(letters, 1000000, replace = TRUE)
)
a1 <- function(x){
  d <- x %>% mutate(code = paste0(id, "_", value))
}
a2 <- function(x){
  d <- x %>% mutate(code = glue("{id}_{value}"))
}
m7 <- microbenchmark(
  with_paste = a1(large_dataset),
  with_glue = a2(large_dataset),
  times = 20
)
print(m7)
## Unit: milliseconds
##        expr      min       lq     mean  median       uq      max neval
##  with_paste 552.6167 557.1574 562.2372 560.558 563.8308 594.2514    20
##   with_glue 573.0987 577.9873 599.4778 580.664 583.7402 957.0961    20
fig <- fbox_plot(m7, "miliseconds")
fig

for loop vs lapply

# Example data
data <- data.table(group = rep(seq(10), each = 100), value = rnorm(1000))
print(table(data$group))
## 
##   1   2   3   4   5   6   7   8   9  10 
## 100 100 100 100 100 100 100 100 100 100
# Using a for loop
for_loop_function <- function(data) {
  res <- list()
  unique_groups <- unique(data$group)
  for(this_group in unique_groups) {
    res[[this_group]] <- data %>% filter(group == this_group)
  }
  return(res)
}
sapply_function <- function(data){
  unique_groups <- unique(data$group)
  res <- list()
  sapply(unique_groups, function(this_group){
    res[[this_group]] <<- data %>% filter(group == this_group)
  })
  return(res)
}

m8 <- microbenchmark(
  for_loop = for_loop_function(data),
  sapply = sapply_function(data),
  times = 500
)

print(m8)
## Unit: milliseconds
##      expr      min       lq     mean   median       uq      max neval
##  for_loop 6.498610 6.629428 7.017537 6.699711 6.775863 51.19699   500
##    sapply 6.565846 6.693649 6.941466 6.761019 6.844546 15.76947   500
fig <- fbox_plot(m8, "miliseconds")
fig

data.table package functions

Date vs IDate

## Unit: microseconds
##   expr      min        lq      mean   median        uq      max neval
##   Date 1445.498 1491.2485 1738.5549 1532.510 1863.5890 3916.971   200
##  iDate  571.246  591.3335  682.4735  618.344  654.5415 2561.762   200
fig <- fbox_plot(m9, "miliseconds")
fig

Base R switch vs Dplyr case_when (for simple tasks)

switch_function <- function(x) {
  switch(x,
         "a" = "apple",
         "b" = "banana",
         "c" = "cherry",
         "default")
}
case_when_function <- function(x) {
  case_when(
    x == "a" ~ "apple",
    x == "b" ~ "banana",
    x == "c" ~ "cherry",
    TRUE ~ "default"
  )
}
# Create a vector of test values
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
m10 <- microbenchmark(
  switch = sapply(test_values, switch_function),
  case_when = sapply(test_values, case_when_function),
  times = 200L
)
print(m10)
## Unit: microseconds
##       expr        min         lq        mean    median         uq        max
##     switch    630.266    639.604    664.4921    645.39    657.512   2150.794
##  case_when 226570.715 233691.301 235931.5762 235607.96 236796.573 331029.292
##  neval
##    200
##    200
fig <- fbox_plot(m10, "microseconds")
fig

data.table fcase vs Dplyr case_when

set.seed(123)
n <- 1e6
data <- data.table(
  id = seq(n),
  value = sample(seq(100), n, replace = TRUE)
)

casewhenf <- function(data){
  df <- data %>%
    mutate(category = case_when(
      value <= 20 ~ "Low",
      value <= 70 ~ "Medium",
      value > 70 ~ "High"))
}
fcasef <- function(data){
  df <- data %>%
    mutate(category = fcase(
      value <= 20, "Low",
      value <= 70, "Medium",
      value > 70, "High"))
}
m11 <- microbenchmark(
  case_when = casewhenf(data),
  fcase = fcasef(data),
  times = 20
)
print(m11)
## Unit: milliseconds
##       expr      min       lq     mean   median       uq      max neval
##  case_when 61.27768 61.57658 65.58367 61.70647 69.79883 80.42101    20
##      fcase 21.91152 21.99503 22.94816 22.07832 22.39973 28.21466    20
fig <- fbox_plot(m11, "miliseconds")
fig

data.table fcoalesce vs tidyr replace_na

set.seed(123)
DT <- data.table(
  ID = 1:1e6,
  Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
  Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)

# Define the functions
replace_na_f <- function(data){
  DF <- data %>%
    mutate(Value1 = replace_na(Value1, 0),
           Value2 = replace_na(Value2, 0)) %>%
    as.data.table()
}
fcoalesce_f <- function(data){
  DF <- data %>%
    mutate(Value1 = fcoalesce(Value1, 0L),
           Value2 = fcoalesce(Value2, 0L))
}
m12 <- microbenchmark(
  treplace_na = replace_na_f(DT),
  tfcoalesce = fcoalesce_f(DT),
  times = 20
)
print(m12)
## Unit: milliseconds
##         expr      min       lq     mean   median       uq       max neval
##  treplace_na 7.431031 7.547438 8.065811 7.763336 8.346253 10.204548    20
##   tfcoalesce 1.530096 1.609288 2.064029 1.914384 2.290365  3.994967    20
fig <- fbox_plot(m12, "miliseconds")
fig

data.table notation vs dplyr notation

dt <- data.table(field_name = c("argentina.blue.man.watch", 
                                "brazil.red.woman.shoes", 
                                "canada.green.kid.hat", 
                                "denmark.red.man.shirt"))

# Filter rows where 'field_name' does not contain 'red'
dtnot <- function(data){
  filtered_dt <- data |> _[!grepl("red", field_name)]
}
dplyrnot <- function(data){
  filtered_dt <- data %>% filter(!grepl("red", field_name))
}

m13 <- microbenchmark(
  tdtnot = dtnot(dt),
  tdplyrnot = dplyrnot(dt),
  times = 100
)
print(m13)
## Unit: microseconds
##       expr     min       lq     mean   median       uq      max neval
##     tdtnot 101.399 111.2125 148.0785 131.6110 139.0645 1964.697   100
##  tdplyrnot 668.067 692.6130 768.1297 706.5535 732.0465 3121.225   100
fig <- fbox_plot(m13, "microseconds")
fig

data.table melt vs tidyr pivot_longer

large_data <- data.table(
  id = 1:100000,
  var1 = rnorm(100000),
  var2 = rnorm(100000),
  var3 = rnorm(100000),
  var4 = rnorm(100000)
)
# Benchmarking
m14 <- microbenchmark(
  tidyr_pivot_longer = {
    long_data_tidyr <- pivot_longer(large_data, cols = starts_with("var"), 
                                    names_to = "variable", values_to = "value")
  },
  data_table_melt = {
    long_data_dt <- melt(large_data, id.vars = "id", variable.name = "variable", 
                         value.name = "value")
  },
  times = 10
)

print(m14)
## Unit: microseconds
##                expr      min       lq      mean   median       uq       max
##  tidyr_pivot_longer 6280.974 6354.651 7954.2870 6394.480 6553.002 21716.064
##     data_table_melt  463.776  501.196  574.8751  528.767  635.767   796.366
##  neval
##     10
##     10
fig <- fbox_plot(m14, "microseconds")
fig

data.table CJ vs tidyr expand_grid

vec1 <- seq(1000)
vec2 <- seq(1000)

# Define functions to be benchmarked
expand_grid_func <- function() {
  return(expand_grid(vec1, vec2))
}

CJ_func <- function() {
  return(CJ(vec1, vec2))
}

# Perform benchmarking
m15 <- microbenchmark(
  expand_grid = expand_grid_func(),
  CJ = CJ_func(),
  times = 10
)

print(m15)
## Unit: microseconds
##         expr      min       lq      mean    median       uq      max neval
##  expand_grid 2185.580 2193.684 2431.7829 2291.2515 2348.754 3447.254    10
##           CJ  364.982  372.565  597.7959  431.6355  532.104 1786.695    10
fig <- fbox_plot(m15, "microseconds")
fig

data.table rbindlist vs R rbind

# Sample data
size = 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = T),
                                    value = rnorm(size)), simplify = F)

simple_bind <- function(list_of_dfs){
  do.call(rbind, list_of_dfs)
}

dplyr_bind <- function(list_of_dfs){
  bind_rows(list_of_dfs)
}

dt_bind <- function(list_of_dfs){
  rbindlist(list_of_dfs, fill = F)
}

# Benchmark both methods
m16 <- microbenchmark(
  dt_ver = dt_bind(df_list),
  simple = simple_bind(df_list),
  dplyr_ver = dplyr_bind(df_list),
  times = 30
)

print(m16)
## Unit: microseconds
##       expr       min        lq       mean     median        uq       max neval
##     dt_ver   452.044   502.128   597.9160   536.5765   577.137  1880.661    30
##     simple   488.482   536.932   621.2624   561.7235   613.405  1816.060    30
##  dplyr_ver 10104.831 10180.302 10707.3023 10318.7155 10517.521 20018.124    30
fig <- fbox_plot(m16, "microseconds")
fig

stringr word vs tidyr separate vs data.table tstrsplit

set.seed(123)
n <- 1e4
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."), stringsAsFactors = F)
df <- df[rep(1, n), , drop = F]

# Using tidyr::separate
separate_words <- function() {
  df |> 
    separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = F) |> 
    select(-c(w1, w2, w4))
}

# Using stringr::word
stringr_words <- function() {
  df |> 
    mutate(
      w3 = word(text, 3, sep = fixed(".")),
      w5 = word(text, 5, sep = fixed("."))
    )
}

datatable_words <- function() {
  df |> 
    _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}

m17 <- microbenchmark(
  separate = separate_words(),
  stringr = stringr_words(),
  dt = datatable_words(),
  times = 10
)

print(m17)
## Unit: milliseconds
##      expr       min        lq      mean    median        uq       max neval
##  separate  76.67471  83.39636  86.27289  87.43375  90.26654  94.07545    10
##   stringr 171.25134 174.47261 192.55688 183.20939 191.68623 282.26948    10
##        dt  12.64523  12.83104  13.20490  12.86673  13.09131  15.41465    10
fig <- fbox_plot(m17, "miliseconds")
fig

Parallel processing

lapply vs parallel mclapply

# Sample data
size = 1e4
n_cores = parallelly::availableCores()
set.seed(123)

df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = T),
                                    value = rnorm(size)), simplify = F)
extra_df <- data.table(id = sample(seq(size), size, replace = T), 
                       extra_value = runif(size))

# Sequential join
sequential_join <- function() {
  lapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = T)
  })
}

# Parallel join using mclapply
parallel_join <- function() {
  mclapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = T)
  }, mc.cores = n_cores, mc.silent = T, mc.cleanup = T)
}

# Benchmark both methods
m18 <- microbenchmark(
  sequential = sequential_join(),
  parallel = parallel_join(),
  times = 10
)

print(m18)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq      max neval
##  sequential 272.6641 276.9203 307.4911 290.0377 355.3227 368.1817    10
##    parallel 130.8102 134.8705 141.4624 139.9576 146.2125 157.3999    10
fig <- fbox_plot(m18, "miliseconds")
fig